-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LV] Consider interleaving & tail-folding when -enable-wide-lane-mask=true #163387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[LV] Consider interleaving & tail-folding when -enable-wide-lane-mask=true #163387
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-transforms Author: Kerry McLaughlin (kmclaughlin-arm) ChangesCurrently the only way to enable the use of wide active lane masks is to pass Basic cost model changes are also included which reduce the cost of the Patch is 32.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163387.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..14c6125bb1dbf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -209,9 +209,10 @@ struct TailFoldingInfo {
TargetLibraryInfo *TLI;
LoopVectorizationLegality *LVL;
InterleavedAccessInfo *IAI;
+ bool UseWideLaneMask;
TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
- InterleavedAccessInfo *IAI)
- : TLI(TLI), LVL(LVL), IAI(IAI) {}
+ InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
+ : TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
};
class TargetTransformInfo;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..742327635c178 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return TyL.first + ExtraCost;
}
case Intrinsic::get_active_lane_mask: {
- auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
- if (RetTy) {
- EVT RetVT = getTLI()->getValueType(DL, RetTy);
- EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ auto RetTy = cast<VectorType>(ICA.getReturnType());
+ EVT RetVT = getTLI()->getValueType(DL, RetTy);
+ EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (RetTy->isScalableTy()) {
+ if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
+ (!ST->hasSVE2p1() && !ST->hasSME2()) ||
+ TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+ TargetLowering::TypeSplitVector)
+ break;
+ auto LT = getTypeLegalizationCost(RetTy);
+ return LT.first / 2;
+ } else {
if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
!getTLI()->isTypeLegal(RetVT)) {
// We don't have enough context at this point to determine if the mask
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// NOTE: getScalarizationOverhead returns a cost that's far too
// pessimistic for the actual generated codegen. In reality there are
// two instructions generated per lane.
- return RetTy->getNumElements() * 2;
+ return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
}
}
break;
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
if (Required == TailFoldingOpts::Disabled)
Required |= TailFoldingOpts::Simple;
- if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
- Required))
+ TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
+ if (TFI->UseWideLaneMask)
+ DefaultOpts |= TailFoldingOpts::Simple;
+
+ if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
return false;
// Don't tail-fold for tight loops where we would be better off interleaving
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b96d29e635465..5b487586169e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"Use predicated EVL instructions for tail folding. If EVL "
"is unsupported, fallback to data-without-lane-mask.")));
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+ "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+ cl::desc("Enable use of wide get active lane mask instructions"));
+
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ bool useWideActiveLaneMask() const {
+ if (!EnableWideActiveLaneMask)
+ return false;
+
+ TailFoldingStyle TF = getTailFoldingStyle();
+ return TF == TailFoldingStyle::DataAndControlFlow ||
+ TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ }
+
/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- if (!CM.isScalarEpilogueAllowed())
+ if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
return 1;
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- TailFoldingInfo TFI(TLI, &LVL, IAI);
+ TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
if (TTI->preferPredicateOverEpilogue(&TFI))
return CM_ScalarEpilogueNotNeededUsePredicate;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 40b7e8df7aec9..d8a20958b8718 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
using namespace llvm;
using namespace VPlanPatternMatch;
-static cl::opt<bool> EnableWideActiveLaneMask(
- "enable-wide-lane-mask", cl::init(false), cl::Hidden,
- cl::desc("Enable use of wide get active lane mask instructions"));
-
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bbc2975e..50eefbdcee14a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
struct VFRange;
extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
struct VPlanTransforms {
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 609a23bc07938..289dfbf5fc5dc 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-1
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve2p1 | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE2p1-OR-SME2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme2 | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE2p1-OR-SME2
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -intrinsic-cost-strategy=type-based-intrinsic-cost -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=TYPE_BASED_ONLY
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -920,6 +922,7 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 undef, i64 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
@@ -934,28 +937,53 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
-; CHECK-VSCALE-2-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-SVE-LABEL: 'get_lane_mask'
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-SVE2p1-OR-SME2-LABEL: 'get_lane_mask'
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
@@ -966,6 +994,7 @@ define void @get_lane_mask() #0 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 und...
[truncated]
|
✅ With the latest revision this PR passed the undef deprecator. |
…=true Currently the only way to enable the use of wide active lane masks is to pass -enable-wide-lane-mask and force both interleaving & tail-folding with additional flags. This patch changes selectInterleaveCount & preferPredicateOverEpilogue to consider both interleaving and tail-folding if wide lane masks were requested, although the feature remains off by default. Basic cost model changes are also included which reduce the cost of the get.active.lane.mask intrinsic when the return type would require splitting, but we know the whilelo (predicate pair) instruction can be used.
8fa2cc5
to
442a7d1
Compare
TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL, | ||
InterleavedAccessInfo *IAI) | ||
: TLI(TLI), LVL(LVL), IAI(IAI) {} | ||
InterleavedAccessInfo *IAI, bool UseWideLaneMask = false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This flag is being set by a loop vectoriser flag called 'EnableWideLaneMask', which to me isn't the same as 'UseWideLaneMask'. The latter makes it sound like a decision has already been made, whereas the former sounds more like a possibility if the target wishes to use them.
|
||
if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(), | ||
Required)) | ||
TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure about this behaviour to be honest. Enabling the use of wide lane masks itself doesn't automatically imply that we should tail-fold all simple loops. I see two issues here:
- I think if we want to go down the route of forcing tail-folding with wide lane masks then whatever flags we use should probably be named more appropriately, i.e. ForceTailFoldingWithWideLaneMasks or something like that.
- It gives users no option to only enable tail-folding with wide lane masks for reductions, etc. For example, -mllvm -enable-wide-lane-masks -mllvm -sve-tail-folding=reductions will never work.
If you want a way to force simple tail-folding with wide lane masks it might be better to use a target flag that lives in this file. For example, you could add a new option to -sve-tail-folding
, i.e. something like -sve-tail-folding=simple+widelanemasks. This way also gives you the option of getting more fine-grained testing with just reductions, recurrences, etc.
You could rename the loop vectoriser flag EnableWideLaneMasks
to ForceWideLaneMasks
or UseWideLaneMasks
so that there is still a way to test this for other targets, but it would be off by default. If the flag is off then the target would have another chance to opt in to this based on their preference. I guess you'd also have to change this interface to return an enum rather than a boolean. How about something like
enum PreferredPredicationStyle {
None,
PredicatedBody,
PredicatedBodyWideLaneMasks
};
or something like that?
if (TFI->UseWideLaneMask) | ||
DefaultOpts |= TailFoldingOpts::Simple; | ||
|
||
if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure it's a good idea to ignore the instruction threshold below because the problem that exists for normal tail-folding will also exist for wide lane masks. If the user really wants to test the special case of tail-folding for small loops they can always do it in conjunction with -sve-tail-folding-insn-threshold=0.
// due to the increased register pressure. | ||
|
||
if (!CM.isScalarEpilogueAllowed()) | ||
if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This deserves some explanation, why useWideActiveLaneMask
should be treated differently. This check may serve as proxy for optimizing for size. Could you add a test to make sure we do not interleave with wide active lane masks when optimizing for size?
Currently the only way to enable the use of wide active lane masks is to pass
-enable-wide-lane-mask and force both interleaving & tail-folding with additional
flags. This patch changes selectInterleaveCount & preferPredicateOverEpilogue to
consider both interleaving and tail-folding if wide lane masks were requested,
although the feature remains off by default.
Basic cost model changes are also included which reduce the cost of the
get.active.lane.mask intrinsic when the return type would require splitting,
but we know the whilelo (predicate pair) instruction can be used.